{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "cde8a9dc",
   "metadata": {},
   "source": [
    "# LangWatch Evaluation Tracking\n",
    "\n",
    "## Step 1: Define our LLM pipeline\n",
    "\n",
    "Let's create a simple RAG pipeline using LangChain, guaranteeing that we can get the output and the retrieved documents used during generation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a61c2b75",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "USER_AGENT environment variable not set, consider setting it to identify your requests.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-05-24 10:04:14,630 - langwatch.utils.initialization - INFO - Setting up LangWatch client...\n",
      "2025-05-24 10:04:14,636 - langwatch.client - INFO - Configuring OTLP exporter with endpoint: http://localhost:5560/api/otel/v1/traces\n",
      "2025-05-24 10:04:14,636 - langwatch.client - INFO - Registering atexit handler to flush tracer provider on exit\n",
      "2025-05-24 10:04:14,636 - langwatch.client - INFO - Successfully configured tracer provider with OTLP exporter\n",
      "2025-05-24 10:04:14,637 - langwatch.utils.initialization - INFO - LangWatch client setup complete\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/rp/9_s_f3kd1ssb089myww_p9zw0000gn/T/ipykernel_62985/2347759746.py:39: LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 1.0. Use :meth:`~invoke` instead.\n",
      "  retrieved_documents = retriever.get_relevant_documents(query)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "retrieved_documents: ['Introduction - LangWatchLangWatch home pageSearch...llms.txtSupportDashboardlangwatch/langwatchlangwatch/langwatchSearch...NavigationGet StartedIntroductionDocumentationOpen DashboardGitHub RepoGet StartedIntroductionSelf HostingCookbooksLLM ObservabilityOverviewConceptsLanguage APIs & SDKsUser EventsMonitoring & AlertsCode ExamplesLLM EvaluationOffline EvaluationReal-Time EvaluationList of EvaluatorsDatasetsAnnotationsLLM DevelopmentPrompt Optimization StudioDSPy VisualizationLangWatch MCPPrompt VersioningAPI EndpointsTracesPromptsAnnotationsDatasetsSupportTroubleshooting and SupportStatus PageGet StartedIntroductionCopy pageWelcome to LangWatch, the all-in-one open-source LLMops platform.LangWatch allows you to track, monitor, guardrail and evaluate your LLMs apps for measuring quality and alert on issues.\\nFor domain experts, it allows you to easily sift through conversations, see topics being discussed and annotate and score messages', 'For domain experts, it allows you to easily sift through conversations, see topics being discussed and annotate and score messages\\nfor improvement in a collaborative manner with the development team.\\nFor developers, it allows you to debug, build datasets, prompt engineer on the playground and\\nrun batch evaluations or DSPy experiments to continuously improve the product.\\nFinally, for the business, it allows you to track conversation metrics and give full user and quality analytics, cost tracking, build\\ncustom dashboards and even integrate it back on your own platform for reporting to your customers.\\nYou can sign up and already start the integration on our free tier by following the guides bellow:\\nPython Integration GuideTypeScript Integration GuideREST API\\nYou can also open the demo project check out a video on our platform.\\n\\u200bGet in touch\\nFeel free to reach out to us directly at [email\\xa0protected]. You can also open a GitHub issue', 'You can also open the demo project check out a video on our platform.\\n\\u200bGet in touch\\nFeel free to reach out to us directly at [email\\xa0protected]. You can also open a GitHub issue\\nto report bugs and request features, or join our Discord channel and ask questions directly for the community and the core team.Was this page helpful?YesNoOverviewgithubwebsitex-twitterlinkedinyoutubePowered by MintlifyOn this pageGet in touchAssistantResponses are generated using AI and may contain mistakes.']\n",
      "output: LangWatch is an open-source LLMops platform for tracking, monitoring, and evaluating LLM applications, helping improve quality and alert on issues.\n"
     ]
    }
   ],
   "source": [
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv(dotenv_path=\"langwatch/python-sdk/.env\")\n",
    "\n",
    "import langwatch\n",
    "\n",
    "from langchain.prompts import ChatPromptTemplate\n",
    "\n",
    "from langchain_community.document_loaders import WebBaseLoader\n",
    "from langchain_community.vectorstores.faiss import FAISS\n",
    "from langchain_core.vectorstores.base import VectorStoreRetriever\n",
    "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "from langchain.tools.retriever import create_retriever_tool\n",
    "from langchain.agents import AgentExecutor, create_tool_calling_agent\n",
    "from langchain.tools import BaseTool\n",
    "from langchain_core.documents import Document\n",
    "\n",
    "\n",
    "loader = WebBaseLoader(\"https://docs.langwatch.ai\")\n",
    "docs = loader.load()\n",
    "documents = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=1000, chunk_overlap=200\n",
    ").split_documents(docs)\n",
    "\n",
    "vector = FAISS.from_documents(documents, OpenAIEmbeddings())\n",
    "retriever = vector.as_retriever()\n",
    "\n",
    "retrieved_documents = []\n",
    "\n",
    "# Wrap the FAISS retriever so that we can capture which documents were used to generate the response\n",
    "@tool\n",
    "def langwatch_search(\n",
    "    query: str\n",
    ") -> list[Document]:\n",
    "    \"\"\"\"Search for information about LangWatch. For any questions about LangWatch, use this tool if you didn't already\"\"\"\n",
    "\n",
    "    global retrieved_documents\n",
    "    retrieved_documents = retriever.get_relevant_documents(query)\n",
    "    return retrieved_documents\n",
    "\n",
    "tools = [langwatch_search]\n",
    "model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
    "prompt = ChatPromptTemplate.from_messages(\n",
    "    [\n",
    "        (\n",
    "            \"system\",\n",
    "            \"You are a helpful assistant that only reply in short tweet-like responses, use tools only once.\\n\\n{agent_scratchpad}\",\n",
    "        ),\n",
    "        (\"human\", \"{question}\"),\n",
    "    ]\n",
    ")\n",
    "agent = create_tool_calling_agent(model, tools, prompt)\n",
    "executor = AgentExecutor(agent=agent, tools=tools, verbose=False)  # type: ignore\n",
    "\n",
    "@langwatch.trace()\n",
    "def execute_rag_pipeline(question: str):\n",
    "    response = executor.invoke({\"question\": question})[\"output\"]\n",
    "    contexts = [d.page_content for d in retrieved_documents]\n",
    "    return response, contexts\n",
    "\n",
    "response, contexts = execute_rag_pipeline(\"What is LangWatch?\")\n",
    "\n",
    "print(\"\")\n",
    "print(\"retrieved_documents:\", contexts)\n",
    "print(\"output:\", response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eac02587",
   "metadata": {},
   "source": [
    "## Step 2: Run the Offline Evaluation\n",
    "\n",
    "Now we can use the dataset we have from LangWatch to run a batch evaluation experiment through our LLM pipeline, to see the results and tweak it for optimizations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d47459a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Follow the results at: http://localhost:5560/inbox-narrator/experiments/my-incredible-experiment?runId=hopping-goat-of-glory\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "62ea227bee9b4f529f2c4efef7b93c7c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Evaluating:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Failed to detach context\n",
      "Traceback (most recent call last):\n",
      "  File \"/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/opentelemetry/context/__init__.py\", line 155, in detach\n",
      "    _RUNTIME_CONTEXT.detach(token)\n",
      "  File \"/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/opentelemetry/context/contextvars_context.py\", line 53, in detach\n",
      "    self._current_context.reset(token)\n",
      "ValueError: <Token var=<ContextVar name='current_context' default={} at 0x10b2d4d10> at 0x31ab882c0> was created in a different Context\n",
      "Failed to detach context\n",
      "Traceback (most recent call last):\n",
      "  File \"/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/pydantic/type_adapter.py\", line 271, in _init_core_attrs\n",
      "    self.core_schema = _getattr_no_parents(self._type, '__pydantic_core_schema__')\n",
      "                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/pydantic/type_adapter.py\", line 55, in _getattr_no_parents\n",
      "    raise AttributeError(attribute)\n",
      "AttributeError: __pydantic_core_schema__\n",
      "\n",
      "During handling of the above exception, another exception occurred:\n",
      "\n",
      "Traceback (most recent call last):\n",
      "  File \"/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/opentelemetry/context/__init__.py\", line 155, in detach\n",
      "    _RUNTIME_CONTEXT.detach(token)\n",
      "  File \"/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/opentelemetry/context/contextvars_context.py\", line 53, in detach\n",
      "    self._current_context.reset(token)\n",
      "ValueError: <Token var=<ContextVar name='current_context' default={} at 0x10b2d4d10> at 0x319a37e40> was created in a different Context\n",
      "Failed to detach context\n",
      "Traceback (most recent call last):\n",
      "  File \"/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/pydantic/type_adapter.py\", line 271, in _init_core_attrs\n",
      "    self.core_schema = _getattr_no_parents(self._type, '__pydantic_core_schema__')\n",
      "                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/pydantic/type_adapter.py\", line 55, in _getattr_no_parents\n",
      "    raise AttributeError(attribute)\n",
      "AttributeError: __pydantic_core_schema__\n",
      "\n",
      "During handling of the above exception, another exception occurred:\n",
      "\n",
      "Traceback (most recent call last):\n",
      "  File \"/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/opentelemetry/context/__init__.py\", line 155, in detach\n",
      "    _RUNTIME_CONTEXT.detach(token)\n",
      "  File \"/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/.venv/lib/python3.11/site-packages/opentelemetry/context/contextvars_context.py\", line 53, in detach\n",
      "    self._current_context.reset(token)\n",
      "ValueError: <Token var=<ContextVar name='current_context' default={} at 0x10b2d4d10> at 0x31a879580> was created in a different Context\n"
     ]
    }
   ],
   "source": [
    "import langwatch\n",
    "import pandas as pd\n",
    "\n",
    "# Create a dataset\n",
    "df = pd.DataFrame(\n",
    "    [\n",
    "        {\n",
    "            \"question\": \"What is LangWatch?\",\n",
    "            \"answer\": \"LangWatch is a platform for evaluating and improving language models.\",\n",
    "        },\n",
    "        {\n",
    "            \"question\": \"How do I use LangWatch?\",\n",
    "            \"answer\": \"You can use LangWatch by installing the LangWatch SDK and then calling the LangWatch API.\",\n",
    "        },\n",
    "        {\n",
    "            \"question\": \"Does LangWatch support multiple language models?\",\n",
    "            \"answer\": \"Yes, LangWatch is compatible with all language models by using LiteLLM under the hood.\",\n",
    "        },\n",
    "        {\n",
    "            \"question\": \"Can I visualize evaluation metrics in LangWatch?\",\n",
    "            \"answer\": \"Yes, LangWatch provides dashboards for visualizing key evaluation metrics.\",\n",
    "        },\n",
    "        {\n",
    "            \"question\": \"Is there a free tier for LangWatch?\",\n",
    "            \"answer\": \"LangWatch offers a free tier with limited usage, ideal for small projects and evaluation.\",\n",
    "        },\n",
    "        {\n",
    "            \"question\": \"Where can I find documentation for LangWatch?\",\n",
    "            \"answer\": \"You can find the official documentation on the LangWatch website or GitHub repository.\",\n",
    "        },\n",
    "    ]\n",
    ")\n",
    "# Or retrieve it from LangWatch:\n",
    "# df = langwatch.dataset.get_dataset(\"CEtFivQeya4kyAzy9eJht\").to_pandas()  # dataset--rSAYL4HxQRXHSayq6c7A\n",
    "\n",
    "evaluation = langwatch.evaluation.init(\"my-incredible-experiment\")\n",
    "\n",
    "for index, row in evaluation.loop(df.iterrows()):\n",
    "    def evaluate(index, row):\n",
    "        response, contexts = execute_rag_pipeline(row[\"question\"])\n",
    "\n",
    "        evaluation.run(\n",
    "            \"ragas/faithfulness\",\n",
    "            name=\"Faithfulness\",\n",
    "            index=index,\n",
    "            data={\n",
    "                \"input\": row[\"question\"],\n",
    "                \"output\": response,\n",
    "                \"contexts\": contexts,\n",
    "            },\n",
    "            settings={\n",
    "                \"model\": \"openai/gpt-4o-mini\",\n",
    "                \"max_tokens\": 2048,\n",
    "                \"autodetect_dont_know\": True,\n",
    "            },\n",
    "        )\n",
    "    evaluation.submit(evaluate, index, row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9fa8b21",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
